Liz's portfolio

import numpy as np
import matplotlib.pyplot as plt  # To visualize
import pandas as pd  # To read data
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from stargazer.stargazer import Stargazer
from IPython.core.display import HTML
import seaborn as sn

# Load the Berkeley admissions data from the Stata file.
# preserve_dtypes=False lets pandas upcast numeric columns to its default
# int64/float64 types instead of keeping Stata's narrower storage types.
data = pd.read_stata(
    'berkeley.dta',
    preserve_dtypes=False,
)

# Quick look at how the gender column is distributed.
data.gender.describe()

count     4526
unique       2
top       Male
freq      2691
Name: gender, dtype: object

# Recode the three labelled columns as integers so they can be used in the
# regressions below.  Plain dict indexing is used on purpose: a label that is
# missing from the lookup table raises KeyError instead of silently becoming NaN.
gender = {
    'Male': 0,
    'Female': 1,
}
data['gender'] = [gender[label] for label in data['gender']]

admit = {
    'Rejected': 0,
    'Admitted': 1,
}
data['admit'] = [admit[label] for label in data['admit']]

# Departments A-F become the ordinal codes 0-5.
dept = {
    'A': 0,
    'B': 1,
    'C': 2,
    'D': 3,
    'E': 4,
    'F': 5,
}
data['dept'] = [dept[label] for label in data['dept']]

# Independent copy of the rows whose gender code is 1 ('Female' in the recoding
# above); .copy() keeps later edits to the subset from touching `data`.
onlywomen = data[data['gender'] == 1].copy()

# Summary statistics of the admit indicator within that subset.
onlywomen.admit.describe()

count    1835.000000
mean        0.303542
std         0.459913
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: admit, dtype: float64

# Column-specific aggregates in one table: min/max/median/skew for gender and
# min/max/median/mean for admit.  A statistic requested for only one of the
# columns shows up as NaN for the other.
data.agg(
    dict(
        gender=["min", "max", "median", "skew"],
        admit=["min", "max", "median", "mean"],
    )
)

	gender	admit
min	0.000000	0.00000
max	1.000000	1.00000
median	0.000000	0.00000
skew	0.385339	NaN
mean	NaN	0.38776

# Inspect the dtype of every column in the frame.
data.dtypes

applicant     int64
admit         int64
gender        int64
dept         object
dtype: object

# Simple scikit-learn regression of the admit indicator on the gender code.
# reshape(-1, 1) turns each column into an (n, 1) array, since scikit-learn
# requires a 2-D feature matrix.
X = data['gender'].values.reshape(-1, 1)
y = data['admit'].values.reshape(-1, 1)

# fit() returns the estimator itself, so construction and fitting are chained.
LR = LinearRegression().fit(X, y)

LinearRegression()

# Show the fitted slope(s) of the scikit-learn model.
print('Coefficients: \n', LR.coef_)

Coefficients: 
 [[-0.14164543]]

# Regress admit on gender with statsmodels, including an intercept.
# sm.OLS does not add a constant on its own: if the bare gender column is
# passed as the regressor, the line is forced through the origin, R² is
# reported as "uncentered", and the gender coefficient becomes the female
# admission rate instead of the male/female gap.
X = sm.add_constant(X)

# Build the design matrix from the DataFrame column so the summary rows are
# labelled 'const' and 'gender'.
model = sm.OLS(data['admit'], sm.add_constant(data['gender'])).fit()

# NOTE: a summary saved from an earlier no-intercept fit is stale and should
# be regenerated after this change.
model.summary()

OLS Regression Results
Dep. Variable:	admit	R-squared (uncentered):	0.096
Model:	OLS	Adj. R-squared (uncentered):	0.096
Method:	Least Squares	F-statistic:	482.4
Date:	Sun, 10 Oct 2021	Prob (F-statistic):	1.11e-101
Time:	18:52:17	Log-Likelihood:	-4049.0
No. Observations:	4526	AIC:	8100.
Df Residuals:	4525	BIC:	8106.
Df Model:	1
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
gender	0.3035	0.014	21.964	0.000	0.276	0.331

Omnibus:	24363.738	Durbin-Watson:	0.015
Prob(Omnibus):	0.000	Jarque-Bera (JB):	574.437
Skew:	0.385	Prob(JB):	1.83e-125
Kurtosis:	1.434	Cond. No.	1.00

Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.

# List the distinct values present in the dept column.
data['dept'].unique()

array(['A', 'B', 'C', 'D', 'E', 'F'], dtype=object)

# Summary statistics (count, mean, std, quartiles, min/max) for the frame.
data.describe()

	applicant	admit	gender	dept
count	4526.000000	4526.000000	4526.000000	4526.000000
mean	2263.500000	0.387760	0.405435	2.364781
std	1306.687989	0.487293	0.491030	1.712402
min	1.000000	0.000000	0.000000	0.000000
25%	1132.250000	0.000000	0.000000	1.000000
50%	2263.500000	0.000000	0.000000	2.000000
75%	3394.750000	1.000000	1.000000	4.000000
max	4526.000000	1.000000	1.000000	5.000000

# OLS of Gini on Income_Per_Capita, with an intercept added via add_constant.
# NOTE(review): the frame inspected earlier shows only applicant/admit/gender/dept;
# presumably `data` is reloaded with Gini and Income_Per_Capita columns before
# this line runs - confirm, otherwise this raises KeyError.
Income_Gini = sm.OLS(
    endog=data['Gini'],
    exog=sm.add_constant(data['Income_Per_Capita']),
).fit()